FUNOVA Preprocessing QC statistics ¶

January 2025¶

In [1]:
import os
NOVA_HOME = "/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA"
NOVA_DATA_HOME = '/home/labs/hornsteinlab/Collaboration/FUNOVA'

LOGS_PATH = os.path.join(NOVA_DATA_HOME, "outputs/preprocessing/logs/")
PLOT_PATH = os.path.join(NOVA_DATA_HOME, "outputs/logs/")

os.chdir(NOVA_HOME)

import pandas as pd
import contextlib
import io
from IPython.display import display, Javascript
import seaborn as sns
from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
                                                show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
                                                show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
                                                calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
                                                plot_cell_count, plot_catplot, plot_hm_combine_batches, plot_hm, \
                                                run_calc_hist_new, plot_marker_data
                                                
from tools.preprocessing_tools.qc_reports.qc_config import (
    funova_markers as markers,
    funova_cell_lines as cell_lines,
    funova_cell_lines_to_cond as cell_lines_to_cond,
    funova_cell_lines_for_disp as cell_lines_for_disp,
    funova_reps as reps,
    funova_line_colors as line_colors,
    funova_lines_order as lines_order,
    funova_custom_palette as custom_palette,
    funova_expected_dapi_raw as expected_dapi_raw,
    funova_panels as panels,
    funova_marker_info as marker_info
)

%load_ext autoreload
%autoreload 2
In [2]:
# choose batches
batches = ['Batch1', 'Batch2']#, 'batch2', 'batch3']
batches
Out[2]:
['Batch1', 'Batch2']

I have created a folder called 'Batch1' in the logs dir and put all files inside

In [3]:
validate_antibody = False
In [4]:
df = log_files_qc(LOGS_PATH, batches, only_wt_cond = False)
reading logs of Batch1

Total of 1 files were read.
Before dup handeling  (182036, 21)
After duplication removal #1: (182036, 22)
After duplication removal #2: (182036, 22)
In [5]:
df['filename'] = df['filename'].str.split('-').str[0]
df['site_num'] = df['site_num'].str.split('-').str[0]
In [6]:
df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']

Actual Files Validation¶

Raw Files Validation¶

  1. How many site tiff files do we have in each folder?
  2. Are all existing files valid? (tif, at least 2049kB, not corrupetd)
In [7]:
root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw')
In [8]:
# import os
# import pandas as pd

# # Define the root path
# root_path = "/home/labs/hornsteinlab/Collaboration/FUNOVA/input/images/raw/"

# # Initialize a dictionary to store the data
# data = {}

# # Walk through the directory structure
# for dirpath, dirnames, filenames in os.walk(root_path):
#     # Check if the current path contains 'repX' and a subfolder for the marker
#     if os.path.basename(os.path.dirname(dirpath)).startswith("rep"):  # Check for 'repX' in parent folder
#         marker = os.path.basename(dirpath)  # The marker is the current folder
#         panel_folder = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(dirpath))))  # Three levels up for the panel
#         if panel_folder.lower().startswith("panel"):  # Ensure it's a panel folder
#             panel = panel_folder.replace("panel", "").strip()
            
#             # Add marker and panel information
#             if marker not in data:
#                 data[marker] = {"Antibody": [], "panel": []}  # Initialize marker entry
#             data[marker]["panel"].append(panel)  # Add panel to the marker

# # Convert the data to a DataFrame
# final_marker_info = pd.DataFrame.from_dict(data, orient="index")

# # Ensure 'panel' and 'Antibody' are arrays and unique panels
# final_marker_info["panel"] = final_marker_info["panel"].apply(lambda x: sorted(set(x)))  # Remove duplicates and sort
# final_marker_info["Antibody"] = [[] for _ in range(len(final_marker_info))]  # Ensure Antibody is an empty array
In [9]:
batches_raw = [batch.replace("_16bit_no_downsample","") for batch in batches]
raws = run_validate_folder_structure(root_directory_raw, False, panels, markers.copy(), PLOT_PATH, marker_info,
                                    cell_lines_to_cond, reps, cell_lines_for_disp, expected_dapi_raw,
                                     batches=batches_raw, fig_width=8,fig_height = 40,
                                    expected_count=100, validate_antibody = validate_antibody)
Batch1
Folder structure is valid.
No bad files are found.
Total Sites:  118400
========
Batch2
Folder structure is valid.
No bad files are found.
Total Sites:  118400
========
====================
In [10]:
# ## Missing data issue was fixed

# differences = (raws[0] != raws[1]).stack()
# differences = differences[differences].index.to_frame(index=False)
# differences.columns = ["Marker", "Rep", "Condition"]
# for condition in differences["Condition"].unique():
#     print(f"Condition: {condition}")
#     condition_data = differences[differences["Condition"] == condition]
#     for rep in condition_data["Rep"].unique():
#         markers = condition_data[condition_data["Rep"] == rep]["Marker"].tolist()
#         print(f"  Rep: {rep}")
#         print(f"    Markers: {', '.join(markers)}")

Processed Files Validation¶

  1. How many site npy files do we have in each folder? -> How many sites survived the pre-processing?
  2. Are all existing files valid? (at least 100kB, npy not corrupted)
In [11]:
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed')
procs = run_validate_folder_structure(root_directory_proc, True, panels, markers, PLOT_PATH, marker_info,
                                    cell_lines_to_cond, reps, cell_lines_for_disp, expected_dapi_raw,
                                     batches=batches, fig_width=8,fig_height = 40,
                                    expected_count=100, validate_antibody = validate_antibody)
Batch1
Folder structure is valid.
No bad files are found.
Total Sites:  91000
========
Batch2
Folder structure is invalid. Missing 1 paths:
/home/labs/hornsteinlab/Collaboration/FUNOVA/input/images/processed/Batch2/C9orf72-HRE/stress/Neuronal_activity
No bad files are found.
Total Sites:  77986
========
====================

Difference between Raw and Processed¶

In [12]:
display_diff(batches, raws, procs, PLOT_PATH, fig_width=8, fig_height = 40)
Batch1
========
Batch2
========

Variance in each batch (of processed files)¶

In [31]:
for batch in batches:
    with contextlib.redirect_stdout(io.StringIO()):
        var = sample_and_calc_variance(root_directory_proc, batch, 
                                       sample_size_per_markers=200, cond_count=1, rep_count=len(reps), 
                                       num_markers=len(markers))
    print(f'{batch} var: ',var)
Batch1 var:  0.01778099779891865
Batch2 var:  0.018942129863741438

Preprocessing Filtering qc¶

By order of filtering

1. % site survival after Brenner on DAPI channel¶

Percentage out of the total sites

In [14]:
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,batches, line_colors, panels,
                                                        figsize=(6,18), reps=reps, vmax=100)

2. % Site survival after Cellpose¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if Cellpose found 0 cells in it.

In [15]:
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, batches, dapi_filter_by_brenner, 
                                                           line_colors, panels, figsize=(6,18), reps=reps)

3. % Site survival by tiling¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if after tiling, no tile is containing at least 85% of a cell that Cellpose detected.

In [16]:
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, batches, dapi_filter_by_cellpose, 
                                                     line_colors, panels, figsize=(6,18), reps=reps)

4. % Site survival after Brenner on target channel¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).

In [17]:
show_site_survival_target_brenner(df_dapi, df_target, dapi_filter_by_tiling,
                                 figsize=(6,24), markers=markers)

Statistics About the Processed Files¶

In [18]:
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, markers)
In [19]:
plot_marker_data(total_sum, split_by_cell_line=True)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
In [20]:
plot_marker_data(total_sum, split_by_cell_line=False)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1304: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  sns.barplot(

Total tiles¶

In [21]:
total_sum.n_valid_tiles.sum()
Out[21]:
955381

Total whole nuclei in tiles¶

In [22]:
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
Out[22]:
411684.0

Total nuclei in sites¶

In [23]:
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
Out[23]:
1106239.0
In [24]:
show_total_sum_tables(total_sum)
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
Batch1
count 587.000000 587.000000 587.000000 5.870000e+02
mean 958.589438 9.585894 1165.614991 3.118043e+03
std 979.237781 9.792378 1225.856911 3.556683e+03
min 1.000000 0.010000 1.000000 1.000000e+00
25% 268.500000 2.685000 309.000000 7.505000e+02
50% 525.000000 5.250000 619.000000 1.492000e+03
75% 1111.000000 11.110000 1303.500000 2.809000e+03
max 3736.000000 37.360000 4768.000000 1.391200e+04
sum 562692.000000 NaN 684216.000000 1.830291e+06
expected_count 450.000000 450.000000 450.000000 4.500000e+02
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
Batch2
count 587.000000 587.000000 587.000000 5.870000e+02
mean 668.976150 6.689761 824.144804 2.182400e+03
std 746.778862 7.467789 948.683856 2.795358e+03
min 1.000000 0.010000 1.000000 1.000000e+00
25% 178.500000 1.785000 207.000000 4.530000e+02
50% 337.000000 3.370000 408.000000 9.070000e+02
75% 824.000000 8.240000 915.500000 2.273000e+03
max 3645.000000 36.450000 4537.000000 1.366000e+04
sum 392689.000000 NaN 483773.000000 1.281069e+06
expected_count 450.000000 450.000000 450.000000 4.500000e+02
n valid tiles % valid tiles site_whole_cells_counts_sum site_cell_count
All batches
count 1174.000000 1174.000000 1.174000e+03 1.174000e+03
mean 813.782794 8.137828 9.948799e+02 2.650221e+03
std 882.401856 8.824019 1.108835e+03 3.231459e+03
min 1.000000 0.010000 1.000000e+00 1.000000e+00
25% 225.000000 2.250000 2.572500e+02 5.775000e+02
50% 430.500000 4.305000 5.045000e+02 1.166000e+03
75% 1004.500000 10.045000 1.172500e+03 2.724750e+03
max 3736.000000 37.360000 4.768000e+03 1.391200e+04
sum 955381.000000 NaN 1.167989e+06 3.111360e+06
expected_count 450.000000 450.000000 4.500000e+02 4.500000e+02

Show Total Tile Counts¶

For each batch, cell line, replicate and markerTotal number of tiles

In [25]:
to_heatmap = total_sum.rename(columns={'n_valid_tiles':'index'})
plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
                      xlabel = 'Total number of tiles', show_sum=True, figsize=(6,24))
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)

Show Total Whole Cell Counts¶

For each batch, cell line, replicate and markerTotal number of tiles

In [26]:
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum':'index'})
plot_filtering_heatmap(to_heatmap, extra_index='marker', vmin=None, vmax=None,
                      xlabel = 'Total number of whole cells', show_sum=True, figsize=(6,24))
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:386: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)

Show Cell Count Statistics per Batch¶

In [27]:
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]
plot_cell_count(df_no_empty_sites, lines_order, custom_palette, y='site_cell_count_sum', 
                title='Cell Count Average per Site (from tiles)', figsize=(16,6))

plot_cell_count(df_no_empty_sites, lines_order, custom_palette, y='site_whole_cells_counts_sum',
                title='Whole Cell Count Average per Site',figsize=(16,6))

plot_cell_count(df_no_empty_sites, lines_order, custom_palette, y='site_cell_count',
               title='Cellpose Cell Count Average per Site',figsize=(16,6))

Show Tiles per Site Statistics¶

In [28]:
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
Out[28]:
cell_line_cond
C9orf72-HRE Untreated      3.509666
C9orf72-HRE stress         3.177155
Control Untreated          6.673802
Control stress             5.215105
TDP--43-G348V Untreated    2.422217
TDP--43-G348V stress       1.821556
TDP--43-N390D Untreated    4.763732
TDP--43-N390D stress       3.445943
Name: n_valid_tiles, dtype: float64
In [29]:
plot_catplot(df_dapi, sns.color_palette('colorblind'), reps=reps,x='cell_line', y_title='Valid Tiles Count', x_title='Cell Line', y='n_valid_tiles', hue='rep',
             height=4, aspect=2)
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1024: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep']
/home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1035: UserWarning: The palette list has more values (10) than needed (2), which may not be intended.
  g = sns.catplot(kind='box', data=df, y=y, x=x,height=height, aspect=aspect, hue=hue, palette=palette,

Show Mean of cell count in valid tiles¶

In [30]:
# plot_hm(df_dapi, split_by='rep', rows='cell_line', columns='panel', vmax=3)

Assessing Staining Reproducibility and Outliers¶

In [31]:
# for batch in batches:
#     print(batch)
#     run_calc_hist_new(batch,cell_lines_for_disp, markers, root_directory_raw, root_directory_proc,
#                            hist_sample=10,sample_size_per_markers=10, ncols=4, nrows=1, figsize=(6,2))
#     print("="*30)
    
In [32]:
# save notebook as HTML ( the HTML will be saved in the same folder the original script is)
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html tools/preprocessing_tools/qc_reports/qc_report_funova.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/qc_report_funova.html')
[NbConvertApp] Converting notebook tools/preprocessing_tools/qc_reports/qc_report_funova.ipynb to html
[NbConvertApp] Writing 7536693 bytes to /home/labs/hornsteinlab/Collaboration/NOVA_GAL/NOVA/manuscript/preprocessing_qc_reports/qc_report_funova.html
Out[32]:
0